########################################################################
####              This script creates the Release and Crime         ####
####                versus Predicted Risk plots shown               ####
####                            in Figure II.                       ####
########################################################################
require(plyr)
require(dplyr)
require(caret)
require(ggplot2)
require(pROC)

### load total training set
df <- read.csv('bail_dataset.csv')

##### 1. Divide into Train / Test / Impute #####
set.seed(1234567890)

###
# set aside 20% for test
test.ix     <- createDataPartition(df$fta, p = .2, list = F)
test        <- df[test.ix,]

# remove test from rest
df          <- df[-test.ix,]

# split the remaining half
train.ix    <- createDataPartition(df$fta, p = .5, list = F)

train       <- df[train.ix, ]
impute      <- df[-train.ix,]

# releases
train.rel   <- which(train$released  == 'Definitely')
impute.rel  <- which(impute$released == 'Definitely')  

#### 2. Model Feature Selection ####
Xs <- c('arr_age','arr_counts','arr_county','arr_mo',
        'arr_charge_category','arr_class_fac', 
        'arr_vfo', 'cycle_cc_dv', 'arr_firearm', 'arr_weapon', 
        'arr_minors', 'arr_hate', 'arr_drug', 'arr_1192', 
        'p_ftas',
        'p_felarr', 'p_misdarr',
        'p_vfoarr', 'p_drugarr', 'p_minorsarr',
        'p_weparr', 'p_soarr', 'p_vtl','p_dwi','p_firearm',
        'p_felcon', 'p_misdcon', 'p_vfocon',
        'p_drugcon', 'p_drugfelcon','p_drugmisdcon','p_minorscon',
        'p_wepcon', 'p_socon', 'p_vtlcon','p_dwifelcon','p_dwimisdcon','p_firearmcon'
)

options(na.action = 'na.pass')

#### 3. Predict ####
# make model matrices
train.mat     <- model.matrix(~ ., train[,Xs])[,-1]
impute.mat    <- model.matrix(~ ., impute[,Xs])[,-1]
test.mat      <- model.matrix(~ ., test[,Xs])[,-1]

# read in models
train.fit     <- readRDS('output/models/training_model')
impute.fit    <- readRDS('output/models/imputation_model')

test$phat     <- predict(train.fit,  newdata = test.mat, type = 'prob')[,2]
test$impute   <- predict(impute.fit, newdata = test.mat, type = 'prob')[,2]

# update imputed values to observed for released people
test$impute   <- ifelse(test$released == 'Definitely', test$fta, test$impute)
### sort by p hat desc
test          <- arrange(test, phat)

# calculate ML fta's
ml.fta.n      <- sum(test[1:judge.rel.n,'impute'])
ml.fta.r      <- ml.fta.n/nrow(test) # ML FTA

# calculate ML releases
# first, add a cumulative FTA field
test$cum.fta  <- cumsum(test$impute)
# The equivalent number of ML releases will be the first row where the cumulative FTA rate is greater than or equal to the judges rate
ml.rel.n      <- min(which(test$cum.fta >= judge.fta.n))
ml.rel.r      <- ml.rel.n/nrow(test)

judge.det     <- 1 - judge.rel.r
ml.det        <- 1 - ml.rel.r

# add indicator field for whether the person is released by the algorithm - will come in handy later
# holding release rate constant
test$ml.rel.relconstant                   <- 0
test[1:judge.rel.n, 'ml.rel.relconstant'] <- 1

# holding FTA rate constant
test$ml.rel.ftaconstant                   <- 0
test[1:ml.rel.n,'ml.rel.ftaconstant']     <- 1

test$rel        <- as.numeric(test$released == 'Definitely')
test_releases   <- filter(test, released == 'Definitely')

##########################################
####    Do calculations for plots     ####
##########################################
binz            <- 1000

test$percentile <- cut(test$phat, breaks = quantile(test$phat, seq(0, 1, 1/binz)), labels = F, include.lowest = T)

test_releases$percentile <- cut(test_releases$phat, breaks = quantile(test_releases$phat, seq(0, 1, 1/binz)), 
                                labels = F, include.lowest = T)

test_sum        <- summarise(group_by(test, percentile), all.rel = mean(rel), all.yhat = mean(phat), all.fta = mean(fta), any_p_fta = mean(any_fta))
rel_sum         <- summarise(group_by(test_releases, percentile), rel.rel = mean(rel), rel.yhat = mean(phat), rel.fta = mean(fta))

joint_sum       <- merge(test_sum, rel_sum)

################################
####  Plot Risk vs Crime    ####
################################
ggplot(joint_sum, aes(x = all.yhat, y = rel.fta)) + 
  geom_point(alpha = .5) + 
  geom_abline(slope = 1, intercept = 0, lty = 2) +
  ylim(0, .8) + xlim(0, .8) +
  labs(x = 'Predicted Crime Risk', y = 'Observed Crime Rate')

################################
####  Plot Release vs risk  ####
################################
ggplot(joint_sum, aes(x = all.yhat, y = all.rel)) +
  geom_point(alpha = .5) + geom_smooth(method = 'loess', se = F, col = 'black', lty = 'dashed') +
  xlim(0, .8) + ylim(0, 1) + 
  labs(x = 'Predicted Crime Risk', y = 'Release Rate')










